{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "color = sns.color_palette()\n",
    "%matplotlib inline\n",
    "\n",
    "import jieba\n",
    "import re\n",
    "from sklearn.model_selection import StratifiedKFold,KFold\n",
    "from sklearn import metrics\n",
    "import random\n",
    "\n",
    "import fasttext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "data_path = '../input/train_first.csv'\n",
    "df = pd.read_csv(data_path,header = 0, encoding='utf-8')\n",
    "\n",
    "test_data_path = '../input/predict_first.csv'\n",
    "test_df = pd.read_csv(test_data_path,header = 0, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "stop_word = []\n",
    "stop_words_path = '../input/stop_word.txt'\n",
    "with open(stop_words_path,encoding='utf-8') as f:\n",
    "    for line in f.readlines():\n",
    "        stop_word.append(line.strip())\n",
    "stop_word.append(' ')\n",
    "\n",
    "def clean_str(stri):\n",
    "    stri = re.sub(r'[a-zA-Z0-9]+','',stri)\n",
    "    cut_str = jieba.cut(stri.strip())\n",
    "    list_str = [word for word in cut_str if word not in stop_word]\n",
    "    stri = ' '.join(list_str)\n",
    "    return stri"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 3.814 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "df['Discuss'] = df['Discuss'].map(lambda x : clean_str(x))\n",
    "test_df['Discuss'] = test_df['Discuss'].map(lambda x : clean_str(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def fillnull(x):\n",
    "    if x == '':\n",
    "        return '空白'\n",
    "    else:\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "df['Discuss'] = df['Discuss'].map(lambda x: fillnull(x))\n",
    "test_df['Discuss'] = test_df['Discuss'].map(lambda x: fillnull(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Discuss</th>\n",
       "      <th>Score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>201e8bf2-77a2-3a98-9fcf-4ce03914e712</td>\n",
       "      <td>好大 一个 游乐 公园 已经 次 感觉 玩够 第三 第四次</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>f4d51947-eac4-3005-9d3c-2f32d6068a2d</td>\n",
       "      <td>新 中国 成立 举行 中国 人 来说 重要 深刻 意义</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>74aa7ae4-03a4-394c-bee0-5702d3a3082a</td>\n",
       "      <td>庐山 瀑布 有名 多个 瀑布 最 好看 非 三叠 泉莫属 推荐 一去</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>099661c2-4360-3c49-a2fe-8c783764f7db</td>\n",
       "      <td>觉得 颐和园 北京 最值 一起 地方 相比 下 门票 最贵 故宫 雄伟 气势磅礴 颐和园 宁...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>97ca672d-e558-3542-ba7b-ee719bba1bab</td>\n",
       "      <td>迪斯尼 一日游</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     Id  \\\n",
       "0  201e8bf2-77a2-3a98-9fcf-4ce03914e712   \n",
       "1  f4d51947-eac4-3005-9d3c-2f32d6068a2d   \n",
       "2  74aa7ae4-03a4-394c-bee0-5702d3a3082a   \n",
       "3  099661c2-4360-3c49-a2fe-8c783764f7db   \n",
       "4  97ca672d-e558-3542-ba7b-ee719bba1bab   \n",
       "\n",
       "                                             Discuss  Score  \n",
       "0                      好大 一个 游乐 公园 已经 次 感觉 玩够 第三 第四次      5  \n",
       "1                        新 中国 成立 举行 中国 人 来说 重要 深刻 意义      4  \n",
       "2                 庐山 瀑布 有名 多个 瀑布 最 好看 非 三叠 泉莫属 推荐 一去      4  \n",
       "3  觉得 颐和园 北京 最值 一起 地方 相比 下 门票 最贵 故宫 雄伟 气势磅礴 颐和园 宁...      5  \n",
       "4                                            迪斯尼 一日游      5  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def fasttext_data(data,label):\n",
    "    fasttext_data = []\n",
    "    for i in range(len(label)):\n",
    "        sent = data[i]+\"\\t__label__\"+str(int(label[i]))\n",
    "        fasttext_data.append(sent)\n",
    "    with open('../input/train.txt','w', encoding = 'utf_8_sig') as f:\n",
    "        for data in fasttext_data:\n",
    "            f.write(data)\n",
    "            f.write('\\n')\n",
    "    return '../input/train.txt'\n",
    "\n",
    "def get_predict(pred):\n",
    "    score = np.array([1,2,3,4,5])\n",
    "    pred2 = []\n",
    "    for p in pred:\n",
    "        pr = np.sum(p * score)\n",
    "        pred2.append(pr)\n",
    "    return np.array(pred2)\n",
    "\n",
    "def rmsel(true_label,pred):\n",
    "    rmse = np.sqrt(metrics.mean_squared_error(true_label, pred))\n",
    "    return rmse / (1 + rmse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def fast_cv(df):\n",
    "    X = df['Discuss'].values\n",
    "    y = df['Score'].values\n",
    "    fast_pred = []\n",
    "    folds = list(KFold(n_splits=5, shuffle=True, random_state=2018).split(X, y))\n",
    "    for train_index, test_index in folds:\n",
    "        X_train, X_test = X[train_index], X[test_index]\n",
    "        y_train, y_test = y[train_index], y[test_index]\n",
    "        train_file = fasttext_data(X_train,y_train)\n",
    "        classifier = fasttext.supervised(train_file,'model',lr=0.01,dim=128,label_prefix=\"__label__\", encoding = 'utf-8-sig')\n",
    "        \n",
    "        result = classifier.predict_proba(df.loc[test_index,'Discuss'].tolist(),k=5)\n",
    "        print(result[0:100])\n",
    "        pred = [[int(sco) * proba for sco,proba in result_i] for result_i in result]\n",
    "        pred = [sum(pred_i) for pred_i in pred]\n",
    "        print(pred[0:100])\n",
    "        print(rmsel(y_test,pred))\n",
    "        \n",
    "        test_result = classifier.predict_proba(test_df['Discuss'].tolist(),k=5)\n",
    "        fast_predi = [[int(sco) * proba for sco,proba in result_i] for result_i in test_result]\n",
    "        fast_predi = [sum(pred_i) for pred_i in fast_predi]\n",
    "        fast_pred.append(fast_predi)\n",
    "    \n",
    "    fast_pred = np.array(fast_pred)\n",
    "    fast_pred = np.mean(fast_pred, axis=0)\n",
    "    return fast_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "test_pred1 = fast_cv(df)\n",
    "print(test_pred1[0:100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
